Import libraries

library(dada2)
packageVersion("dada2") # check dada2 version
## [1] '1.21.0'
library(Biostrings)
library(ShortRead)
library(seqTools) # per base sequence content
library(phyloseq)
library(ggplot2)
library(data.table)
library(plyr)
library(dplyr)
library(qckitfastq) # per base sequence content
library(stringr)

QUALITY CHECK

First, we import the fastq files containing the raw reads. The samples were downloaded from the SRA database with the accession number PRJEB37924. Only sigmoid biopsy samples that were analyzed by 16S rRNA sequencing were downloaded (see 00_Metadata-Mars).

# Save the path to the directory containing the fastq zipped files
path <- "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/Mars-2020"
# list.files(path) # check we are in the right directory

# fastq filenames have format: SAMPLENAME.fastq.gz
# Saves the whole directory path to each file name
# Paired-end reads live in the "original/" subdirectory; sort() keeps the
# forward (_1) and reverse (_2) lists in matching sample order.
fnFs <- sort(list.files(file.path(path, "original"), pattern="_1.fastq.gz", full.names = TRUE)) # FWD reads
fnRs <- sort(list.files(file.path(path, "original"), pattern="_2.fastq.gz", full.names = TRUE)) # REV reads
show(fnFs[1:5])
show(fnRs[1:5])

# Extract sample names, assuming filenames have format: SAMPLENAME.fastq.gz
# (everything before the first underscore, e.g. the ERR run accession)
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
show(sample.names[1:5]) # saves only the file name (without the path)

# Look at quality of the first few files (use 1:length(fnFs) to see them all)
for (i in 1:3){  # 1:length(fnFs)
  show(plotQualityProfile(fnFs[i]))
  show(plotQualityProfile(fnRs[i]))
}

# Look at nb of reads per sample.
# NOTE(review): this chunk was commented out, but raw_stats is referenced
# later (primer-count report and saveRDS), so it must actually be created.
raw_stats <- data.frame('sample' = sample.names,
                        'reads' = fastqq(fnFs)@nReads)
min(raw_stats$reads)
max(raw_stats$reads)
mean(raw_stats$reads)

We will have a quick peek at the per base sequence content of the reads in some samples, to make sure there is no anomaly (i.e. all reads having the same sequence).

# Look at per base sequence content (forward read)
# Sample 10 is used as a spot check; fastqq() summarizes the fastq file and
# read_content()/plot_read_content() show the A/C/G/T fraction per position.
fseqF1 <- seqTools::fastqq(fnFs[10])
## [fastqq] File ( 1/1) '/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/Mars-2020/original/ERR5169592_1.fastq.gz'    done.
rcF1 <- read_content(fseqF1)
plot_read_content(rcF1) + labs(title = "Per base sequence content - Forward read")

# Same check for the matching reverse read file
fseqR1 <- seqTools::fastqq(fnRs[10])
## [fastqq] File ( 1/1) '/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/Mars-2020/original/ERR5169592_2.fastq.gz'    done.
rcR1 <- read_content(fseqR1)
plot_read_content(rcR1) + labs(title = "Per base sequence content - Reverse read")

LOOK FOR PRIMERS

Now, we will look at whether the reads still contain the primers. The methods section indicates that the V4 region of the 16S rRNA gene was amplified as described in the paper by Gohl et al. Primer sequences are shared in that paper.

# V4
# 515F/806R primer pair targeting the V4 region of the 16S rRNA gene.
# M, H, V and W are IUPAC ambiguity codes.
FWD <- "GTGCCAGCMGCCGCGGTAA"  # F515 forward primer sequence
REV <- "GGACTACHVGGGTWTCTAAT" # R806 reverse primer sequence 

# Function that, from the primer sequence, returns all possible orientations
# (forward, complement, reverse, reverse complement) as a named character vector.
# Biostrings is attached at the top of the script, so no require() call here
# (require() for loading is an anti-pattern: it returns FALSE instead of erroring).
allOrients <- function(primer) {
    # Biostrings works with DNAString objects rather than character vectors
    dna <- Biostrings::DNAString(primer)
    orients <- c(Forward = dna, Complement = Biostrings::complement(dna),
        Reverse = Biostrings::reverse(dna),
        RevComp = Biostrings::reverseComplement(dna))
    # vapply guarantees a character vector even for unexpected input
    vapply(orients, toString, character(1))
}

# Get all combinations of the primer sequences
FWD.orients <- allOrients(FWD) # F515
REV.orients <- allOrients(REV) # R806
# Sanity check: each primer should yield 4 named orientations
FWD.orients # sanity check
##               Forward            Complement               Reverse 
## "GTGCCAGCMGCCGCGGTAA" "CACGGTCGKCGGCGCCATT" "AATGGCGCCGMCGACCGTG" 
##               RevComp 
## "TTACCGCGGCKGCTGGCAC"
REV.orients
##                Forward             Complement                Reverse 
## "GGACTACHVGGGTWTCTAAT" "CCTGATGDBCCCAWAGATTA" "TAATCTWTGGGVHCATCAGG" 
##                RevComp 
## "ATTAGAWACCCBDGTAGTCC"
# Count how many reads in the fastq file `fn` contain the sequence `primer`
# (IUPAC ambiguity codes honored via fixed = FALSE, up to 2 mismatches allowed).
primerHits <- function(primer, fn) {
    reads <- sread(readFastq(fn))
    hit.counts <- vcountPattern(primer, reads, fixed = FALSE, max.mismatch = 2)
    sum(hit.counts > 0)
}

# Get a table showing how many times the F515 and R806 primers (in all
# orientations) are found in the forward/reverse reads of the first 3 samples.
for (i in 1:3){
  # NOTE(review): the original referenced raw_stats, which is created in a
  # commented-out chunk above; count the reads directly from the fastq file
  # instead so this loop does not depend on that chunk having run.
  n.reads <- length(readFastq(fnFs[[i]]))
  cat("SAMPLE", sample.names[i], "with total number of", n.reads, "reads\n\n")
  x <- rbind(ForwardRead.FWDPrimer = sapply(FWD.orients, primerHits, fn = fnFs[[i]]),
             ForwardRead.REVPrimer = sapply(REV.orients, primerHits, fn = fnFs[[i]]),
             ReverseRead.FWDPrimer = sapply(FWD.orients, primerHits, fn = fnRs[[i]]), 
             ReverseRead.REVPrimer = sapply(REV.orients, primerHits, fn = fnRs[[i]]))
  print(x)
  cat("\n____________________________________________\n\n")
}
## SAMPLE ERR5169583 with total number of 105593 reads
## 
##                       Forward Complement Reverse RevComp
## ForwardRead.FWDPrimer  102879          0       0       4
## ForwardRead.REVPrimer       4          0       0   96739
## ReverseRead.FWDPrimer       4          0       0   83295
## ReverseRead.REVPrimer  103571          0       0       3
## 
## ____________________________________________
## 
## SAMPLE ERR5169584 with total number of 129697 reads
## 
##                       Forward Complement Reverse RevComp
## ForwardRead.FWDPrimer  125832          0       0      13
## ForwardRead.REVPrimer      13          0       0  118601
## ReverseRead.FWDPrimer      13          0       0  103070
## ReverseRead.REVPrimer  127174          0       0      13
## 
## ____________________________________________
## 
## SAMPLE ERR5169585 with total number of 70485 reads
## 
##                       Forward Complement Reverse RevComp
## ForwardRead.FWDPrimer   68860          0       0       1
## ForwardRead.REVPrimer       1          0       0   64299
## ReverseRead.FWDPrimer       1          0       0   55897
## ReverseRead.REVPrimer   69315          0       0       1
## 
## ____________________________________________

FILTER AND TRIM

1. Primer removal

Almost all reads contain the forward & reverse primers. Ideally, to follow the standardized pipeline, we should (1) filter out reads not containing any primer and (2) trim the primers. However, the fastq files deposited on the SRA/ENA databases do not contain Illumina headers. Thus, during the merging step of paired reads later on, the Dada2 package won’t be able to match paired-reads.

We will simply perform a quality filtering of the reads, and trim the first 20 and last 30 bases of the reads (containing the primers).

2. Quality filtering

Then, we perform a quality filtering of the reads.

# Place filtered files in a filtered/ subdirectory
# (written to <path>/filtered2/SAMPLENAME_{F,R}_filt2.fastq.gz)
FWD.filt2_samples <- file.path(path, "filtered2", paste0(sample.names, "_F_filt2.fastq.gz")) # FWD reads
REV.filt2_samples <- file.path(path, "filtered2", paste0(sample.names, "_R_filt2.fastq.gz")) # REV reads
# Assign names for the filtered fastq.gz files
names(FWD.filt2_samples) <- sample.names
names(REV.filt2_samples) <- sample.names

# Filter
# Returns a matrix with reads.in/reads.out per input file (see out2 below)
out2 <- filterAndTrim(fwd = fnFs, filt = FWD.filt2_samples,
                      rev = fnRs, filt.rev = REV.filt2_samples,
                      trimLeft = 20, trimRight=30, # trim primers
                      maxEE=3, # reads with more than 3 expected errors (sum(10^(-Q/10))) are discarded
                      truncQ=10, # Truncate reads at the first instance of a quality score less than or equal to truncQ.
                      minLen=150, # Discard reads shorter than 150 bp.
                      compress=TRUE,
                      multithread = TRUE,
                      verbose=TRUE)

Let’s look at the output filtered fastq files as sanity check.

out2[1:6,] # show how many reads were filtered in each file
##                       reads.in reads.out
## ERR5169583_1.fastq.gz   105593     67882
## ERR5169584_1.fastq.gz   129697     84891
## ERR5169585_1.fastq.gz    70485     46101
## ERR5169586_1.fastq.gz    86720     56496
## ERR5169587_1.fastq.gz    88688     58512
## ERR5169588_1.fastq.gz    44429     21535
# Look at quality profile of all filtered files
# (only the first 3 samples; extend the range to inspect more)
for (i in 1:3){
  show(plotQualityProfile(FWD.filt2_samples[i]))
  show(plotQualityProfile(REV.filt2_samples[i]))
}

LEARN ERROR RATES

Now we will build the parametric error model, to be able to infer amplicon sequence variants (ASVs) later on.

# Learn the error model separately for forward and reverse reads;
# randomize=TRUE samples reads at random rather than in file order.
errF <- learnErrors(FWD.filt2_samples, multithread=TRUE, randomize=TRUE, verbose = 1)
errR <- learnErrors(REV.filt2_samples, multithread=TRUE, randomize=TRUE, verbose = 1)

The error rates for each possible transition (A→C, A→G, …) are shown. Points are the observed error rates for each consensus quality score. The black line shows the estimated error rates after convergence of the machine-learning algorithm. The red line shows the error rates expected under the nominal definition of the Q-score. Here the estimated error rates (black line) are a good fit to the observed rates (points), and the error rates drop with increased quality as expected.

# Estimated vs observed error rates; nominalQ adds the Q-score expectation (red)
plotErrors(errF, nominalQ = TRUE) # Forward reads

plotErrors(errR, nominalQ = TRUE) # Reverse reads

CONSTRUCT SEQUENCE TABLE

1. Infer sample composition

The dada() algorithm infers sequence variants based on estimated errors (previous step). Firstly, we de-replicate the reads in each sample, to reduce the computation time. De-replication is a common step in almost all modern ASV inference (or OTU picking) pipelines, but a unique feature of derepFastq is that it maintains a summary of the quality information for each dereplicated sequence in $quals.

# Dereplicate the reads in the sample
# (collapses identical reads; per-unique-sequence quality kept in $quals)
derepF <- derepFastq(FWD.filt2_samples) # forward
derepR <- derepFastq(REV.filt2_samples) # reverse

# Infer sequence variants using the error models learned above
dadaFs <- dada(derepF, err=errF, multithread=TRUE) # forward
dadaRs <- dada(derepR, err=errR, multithread=TRUE) # reverse
# Inspect the infered sequence variants (first 3 samples only)
for (i in 1:3){
  print(dadaFs[[i]])
  print(dadaRs[[i]])
  print("________________")
}
## dada-class: object describing DADA2 denoising results
## 217 sequence variants were inferred from 13852 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## dada-class: object describing DADA2 denoising results
## 134 sequence variants were inferred from 16309 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 226 sequence variants were inferred from 16473 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## dada-class: object describing DADA2 denoising results
## 125 sequence variants were inferred from 19321 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 188 sequence variants were inferred from 9899 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## dada-class: object describing DADA2 denoising results
## 111 sequence variants were inferred from 11937 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"

2. Merge paired-end reads

We now need to merge paired reads.

# Merge denoised forward and reverse reads into full amplicon sequences
mergers <- mergePairs(dadaFs, derepF, dadaRs, derepR, verbose=TRUE)
head(mergers[[1]])

3. Construct ASV table

We can now construct an amplicon sequence variant table (ASV) table, a higher-resolution version of the OTU table produced by traditional methods.

# Make sequence table from the infered sequence variants
# (rows = samples, columns = ASV sequences, entries = read counts)
seqtable <- makeSequenceTable(mergers)

# We should have 72 samples (72 rows)
dim(seqtable)
## [1]   72 3733
# Inspect distribution of sequence lengths
hist(nchar(getSequences(seqtable)), breaks = 100, xlab = "ASV length", ylab = "Number of ASVs", main="")

# Sequences should be between 515F - 806R, so around ~250bp (removing the primers lengths).
# Remove any sequence variant outside the 200-300bp range
seqtable.new <- seqtable[,nchar(colnames(seqtable)) %in% 200:300]
dim(seqtable.new)
## [1]   72 1738
hist(nchar(getSequences(seqtable.new)), breaks = 100, xlab = "ASV length", ylab = "Number of ASVs", main="")

4. Remove chimeras

The core dada method corrects substitution and indel errors, but chimeras remain. Fortunately, the accuracy of sequence variants after denoising makes identifying chimeric ASVs simpler than when dealing with fuzzy OTUs. Chimeric sequences are identified if they can be exactly reconstructed by combining a left-segment and a right-segment from two more abundant “parent” sequences.

# Flag and drop chimeric ASVs by consensus across samples
seqtable.nochim <- removeBimeraDenovo(seqtable.new, method="consensus", multithread=TRUE, verbose=TRUE)

# Check how many sequence variants we have after removing chimeras
dim(seqtable.nochim)
## [1]   72 1678
# Keep only ASVs that are present in at least 2 samples
# (prevalence filter to drop likely-spurious singleton ASVs)
seqtable.nochim <- seqtable.nochim[, colSums(seqtable.nochim > 0) >=2]
dim(seqtable.nochim)
## [1]  72 827
# Check how many reads we have after removing chimeras (we should keep the vast majority of the reads, like > 80%)
sum(seqtable.nochim)/sum(seqtable.new)
## [1] 0.9637866

LOOK AT READS COUNT THROUGH PIPELINE

Sanity check before assigning taxonomy.

# Function that counts nb of reads in a dada2 object (derep/dada/merger)
getN <- function(x) sum(getUniques(x))

# Table counting reads surviving each step: input, quality-filtered,
# denoised (F and R), merged, non-chimeric, and % of input reads kept.
# NOTE(review): the original wrapped the percentage in lapply(), which put a
# list column inside the matrix; as.integer() on the whole vector keeps the
# result a plain numeric column.
track <- cbind(out2,
               sapply(dadaFs, getN),
               sapply(dadaRs, getN),
               sapply(mergers, getN),
               rowSums(seqtable.nochim),
               as.integer(rowSums(seqtable.nochim) * 100 / out2[, 1]))

# Assign column and row names
colnames(track) <- c("input", "quality-filt", "denoisedF", "denoisedR", 'merged', 'nonchim', "%input->output")
rownames(track) <- sample.names

# Show final table: for each row/sample, we have shown the initial number of reads, filtered reads, denoised reads, and non chimera reads
track
##            input  quality-filt denoisedF denoisedR merged nonchim
## ERR5169583 105593 67882        67514     67468     64359  45932  
## ERR5169584 129697 84891        84386     84242     81354  62986  
## ERR5169585 70485  46101        45855     45742     44299  35776  
## ERR5169586 86720  56496        56148     56038     53806  45383  
## ERR5169587 88688  58512        58086     58008     55905  38513  
## ERR5169588 44429  21535        21249     21253     20536  15130  
## ERR5169589 97536  61839        61359     61297     60215  23782  
## ERR5169590 113298 73704        73246     73047     69120  44772  
## ERR5169591 70662  45311        44986     44815     41039  38649  
## ERR5169592 84848  53043        52654     52515     50040  35300  
## ERR5169593 70070  45741        45278     45200     43624  33305  
## ERR5169594 70514  46313        46066     45968     44669  28068  
## ERR5169595 81234  51884        51578     51518     49541  42853  
## ERR5169596 62654  39902        39785     39663     37516  35209  
## ERR5169597 123998 82637        82288     82277     81354  46665  
## ERR5169598 105876 71672        71412     71414     70545  26610  
## ERR5169599 105644 72969        72674     72648     72158  5822   
## ERR5169600 79270  50487        50177     50177     47739  45376  
## ERR5169601 76041  47762        47346     47320     45669  36877  
## ERR5169602 87122  59028        58789     58775     58113  3823   
## ERR5169603 73742  48317        48114     48084     47010  37102  
## ERR5169604 74329  46552        46278     46151     44192  39795  
## ERR5169605 55292  36072        35901     35734     34195  29688  
## ERR5169606 101878 66803        66575     66609     64616  61324  
## ERR5169607 83029  52595        52324     52211     50107  39044  
## ERR5169608 82800  56168        55822     55799     54485  18543  
## ERR5169609 73407  48392        48111     48109     45666  42398  
## ERR5169610 69347  47775        47518     47440     46274  19669  
## ERR5169611 70806  45827        45462     45284     43149  36600  
## ERR5169612 77124  52468        52161     52163     51271  18750  
## ERR5169613 68381  43427        43265     43233     41903  38396  
## ERR5169614 74287  46555        46216     46183     44623  34923  
## ERR5169615 64188  41889        41656     41529     39654  34782  
## ERR5169616 63924  40179        40011     39896     37615  30001  
## ERR5169617 91806  63077        62795     62710     61169  19089  
## ERR5169618 89209  60319        60007     60032     57906  27549  
## ERR5169619 1633   701          680       633       562    447    
## ERR5169620 72249  44767        44512     44430     42735  26880  
## ERR5169621 86114  55549        55331     55240     52425  34289  
## ERR5169622 52406  34619        34458     34469     32990  27632  
## ERR5169623 97958  61968        61563     61422     59478  38696  
## ERR5169624 65330  41531        41278     41111     38971  34505  
## ERR5169625 55228  34342        34109     34007     32495  25990  
## ERR5169626 75144  45993        45734     45536     42554  39371  
## ERR5169627 65796  41340        41062     40964     38875  33484  
## ERR5169628 67956  41265        40873     40627     37816  29111  
## ERR5169629 72891  47614        47288     47245     45526  34550  
## ERR5169630 83538  54506        54174     54242     51372  45837  
## ERR5169631 67169  42595        42353     42336     41079  32009  
## ERR5169632 53835  34606        34419     34322     32417  29395  
## ERR5169633 52155  34003        33830     33732     32101  23546  
## ERR5169634 76435  48921        48598     48472     45800  35697  
## ERR5169635 53442  34376        34049     33930     31764  26694  
## ERR5169636 41211  26655        26514     26453     25099  21816  
## ERR5169637 46747  29274        29102     29119     27417  21194  
## ERR5169638 76795  46229        45999     45990     45442  32260  
## ERR5169639 71235  42595        42413     42382     40974  29714  
## ERR5169640 39718  23558        23388     23360     22604  15244  
## ERR5169641 68115  44439        44340     44329     43235  35316  
## ERR5169642 17042  11307        11165     11140     11037  361    
## ERR5169643 26208  16351        16162     16060     15450  8082   
## ERR5169644 64295  40761        40620     40574     39237  32417  
## ERR5169645 50906  30554        30265     30319     28395  25414  
## ERR5169646 73257  47766        47571     47625     46251  30310  
## ERR5169647 8075   5306         5216      5196      5124   198    
## ERR5169648 7421   4659         4464      4441      4124   1112   
## ERR5169649 47727  31004        30826     30859     28775  27240  
## ERR5169650 75469  49539        49283     49101     46863  43176  
## ERR5169651 17381  10780        10699     10526     9696   5151   
## ERR5169652 14220  8886         8816      8677      8159   7466   
## ERR5169653 49556  32426        32132     32058     30825  25537  
## ERR5169654 83283  56457        55932     55870     54652  22196  
##            %input->output
## ERR5169583 43            
## ERR5169584 48            
## ERR5169585 50            
## ERR5169586 52            
## ERR5169587 43            
## ERR5169588 34            
## ERR5169589 24            
## ERR5169590 39            
## ERR5169591 54            
## ERR5169592 41            
## ERR5169593 47            
## ERR5169594 39            
## ERR5169595 52            
## ERR5169596 56            
## ERR5169597 37            
## ERR5169598 25            
## ERR5169599 5             
## ERR5169600 57            
## ERR5169601 48            
## ERR5169602 4             
## ERR5169603 50            
## ERR5169604 53            
## ERR5169605 53            
## ERR5169606 60            
## ERR5169607 47            
## ERR5169608 22            
## ERR5169609 57            
## ERR5169610 28            
## ERR5169611 51            
## ERR5169612 24            
## ERR5169613 56            
## ERR5169614 47            
## ERR5169615 54            
## ERR5169616 46            
## ERR5169617 20            
## ERR5169618 30            
## ERR5169619 27            
## ERR5169620 37            
## ERR5169621 39            
## ERR5169622 52            
## ERR5169623 39            
## ERR5169624 52            
## ERR5169625 47            
## ERR5169626 52            
## ERR5169627 50            
## ERR5169628 42            
## ERR5169629 47            
## ERR5169630 54            
## ERR5169631 47            
## ERR5169632 54            
## ERR5169633 45            
## ERR5169634 46            
## ERR5169635 49            
## ERR5169636 52            
## ERR5169637 45            
## ERR5169638 42            
## ERR5169639 41            
## ERR5169640 38            
## ERR5169641 51            
## ERR5169642 2             
## ERR5169643 30            
## ERR5169644 50            
## ERR5169645 49            
## ERR5169646 41            
## ERR5169647 2             
## ERR5169648 14            
## ERR5169649 57            
## ERR5169650 57            
## ERR5169651 29            
## ERR5169652 52            
## ERR5169653 51            
## ERR5169654 26

ASSIGN TAXONOMY

Extensions: The dada2 package also implements a method to make species level assignments based on exact matching between ASVs and sequenced reference strains. Recent analysis suggests that exact matching (or 100% identity) is the only appropriate way to assign species to 16S gene fragments. Currently, species-assignment training fastas are available for the Silva and RDP 16S databases. To follow the optional species addition step, download the silva_species_assignment_v132.fa.gz file, and place it in the directory with the fastq files.

# Assign taxonomy (with silva v138)
taxa <- assignTaxonomy(seqtable.nochim, "~/Projects/IBS_Meta-analysis_16S/data/silva_nr_v138_train_set.fa.gz",
                       tryRC = TRUE, # try reverse complement of the sequences
                       multithread=TRUE, verbose = TRUE)

# Add species assignment
# (exact matching of ASVs against the Silva v138 species reference)
taxa <- addSpecies(taxa, "~/Projects/IBS_Meta-analysis_16S/data/silva_species_assignment_v138.fa.gz")
# Check how the taxonomy table looks like
taxa.print <- taxa 
rownames(taxa.print) <- NULL # Removing sequence rownames for display only
head(taxa.print)
##      Kingdom    Phylum         Class         Order            
## [1,] "Bacteria" "Firmicutes"   "Clostridia"  "Lachnospirales" 
## [2,] "Bacteria" "Bacteroidota" "Bacteroidia" "Bacteroidales"  
## [3,] "Bacteria" "Firmicutes"   "Clostridia"  "Lachnospirales" 
## [4,] "Bacteria" "Firmicutes"   "Clostridia"  "Oscillospirales"
## [5,] "Bacteria" "Firmicutes"   "Clostridia"  "Oscillospirales"
## [6,] "Bacteria" "Firmicutes"   "Clostridia"  "Lachnospirales" 
##      Family            Genus              Species         
## [1,] "Lachnospiraceae" "Blautia"          NA              
## [2,] "Bacteroidaceae"  "Bacteroides"      "vulgatus"      
## [3,] "Lachnospiraceae" "Agathobacter"     NA              
## [4,] "Ruminococcaceae" "Faecalibacterium" NA              
## [5,] "Ruminococcaceae" "Faecalibacterium" "prausnitzii"   
## [6,] "Lachnospiraceae" "Fusicatenibacter" "saccharivorans"
# Quick sanity checks on the assigned taxonomy (kingdom/phylum composition)
table(taxa.print[,1]) # Show the different kingdoms (should be only bacteria)
## 
##   Archaea  Bacteria Eukaryota 
##         2       822         3
taxa.print[taxa.print[,1] == 'Eukaryota',2] # the eukaryota are all NAs
## [1] NA NA NA
table(taxa.print[,2]) # Show the different phyla
## 
##  Actinobacteriota      Bacteroidota  Campilobacterota     Cyanobacteria 
##                42               120                 3                 7 
##  Desulfobacterota     Euryarchaeota        Firmicutes    Fusobacteriota 
##                 5                 1               590                 5 
##   Patescibacteria    Proteobacteria     Spirochaetota  Thermoplasmatota 
##                 4                41                 1                 1 
## Verrucomicrobiota 
##                 4
table(is.na(taxa.print[,2])) # is there any NA phyla?
## 
## FALSE  TRUE 
##   824     3

LAST PREPROCESSING

We will remove any sample with less than 500 reads from further analysis, and also any ASVs with unassigned phyla.

1. Create phyloseq object

The preprocessing will be easier to do with ASV, taxonomic and metadata tables combined in a phyloseq object.

#_________________________
# Import metadata
# (select(-X) drops the automatic index column written by write.csv)
metadata_table <- read.csv(file.path(path, "00_Metadata-Mars/modif_metadata(R).csv")) %>% select(-X)
colnames(metadata_table)
# Run = run accession (e.g. ERR...), matching the seqtable rownames below
rownames(metadata_table) <- metadata_table$Run

# Add a few covariates
# (constant study-level descriptors, useful when merging with other studies)
metadata_table$author <- "Mars"
metadata_table$sequencing_tech <- "Illumina"
metadata_table$variable_region <- "V4"

# Remove _F_filt2.fastq.gz from rownames
# rownames(seqtable.nochim)[1:5]
rownames(seqtable.nochim) <- sub("_.*", "", rownames(seqtable.nochim)) # keep text before first "_"

#_________________________
# Create phyloseq object
physeq <- phyloseq(otu_table(seqtable.nochim, taxa_are_rows=FALSE), # by default, in otu_table the sequence variants are in rows
                  sample_data(metadata_table), 
                  tax_table(taxa))

# Remove taxa that are eukaryota, or have unassigned Phyla
physeq <- subset_taxa(physeq, Kingdom != "Eukaryota")
physeq <- subset_taxa(physeq, !is.na(Phylum))
# Remove samples with less than 500 reads
physeq <- prune_samples(sample_sums(physeq)>=500, physeq)

2. Save to disk

# Save intermediate and final objects to disk
# NOTE(review): raw_stats is only created in a commented-out chunk above;
# guard its save so this section does not error when that chunk was skipped.
if (exists("raw_stats")) {
  saveRDS(raw_stats, file.path(path, "01_Dada2-Mars/raw_stats.rds"))
}
saveRDS(out2, file.path(path, "01_Dada2-Mars/out2.rds"))
saveRDS(errF, file.path(path, "01_Dada2-Mars/errF.rds"))
saveRDS(errR, file.path(path, "01_Dada2-Mars/errR.rds"))
saveRDS(dadaFs, file.path(path, "01_Dada2-Mars/infered_seq_F.rds"))
saveRDS(dadaRs, file.path(path, "01_Dada2-Mars/infered_seq_R.rds"))
saveRDS(mergers, file.path(path, "01_Dada2-Mars/mergers.rds"))

# Final tables extracted from the filtered phyloseq object
saveRDS(otu_table(physeq), file.path(path, "01_Dada2-Mars/ASVtable_final.rds")) # ASV table
saveRDS(tax_table(physeq), file.path(path, "01_Dada2-Mars/taxa_final.rds")) # taxa table
saveRDS(sample_data(physeq), file.path(path, "01_Dada2-Mars/metadata_final.rds")) # metadata table

# Taxa & Phyloseq object
saveRDS(taxa, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/CLUSTER/Taxonomy/output/taxa_mars.rds")
saveRDS(physeq, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/CLUSTER/PhyloTree/input/physeq_mars.rds")

3. Quick peek at data analysis

# Absolute abundance per sample, colored by phylum, one panel per disease group
plot_bar(physeq, fill = "Phylum") +
  facet_wrap("host_disease", scales = "free_x") +
  theme(axis.text.x = element_blank())

# Relative abundance for Phylum
phylum.table <- physeq %>%
  tax_glom(taxrank = "Phylum") %>%                            # collapse counts to phylum level
  transform_sample_counts(function(counts) counts / sum(counts)) %>% # counts -> per-sample proportions
  psmelt()                                                    # long format for ggplot

ggplot(phylum.table, aes(x = Sample, y = Abundance, fill = Phylum)) +
  facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(size = 5, angle = -90)) +
  labs(x = "Samples", y = "Relative abundance")